{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Getting Started with Nemesis Data\n",
    "\n",
    "This notebook demonstrates how to connect to and query Nemesis data using Hasura GraphQL.\n",
    "\n",
    "## Setup\n",
    "\n",
    "First, let's import the required libraries and set up our connection to Hasura."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from gql import Client, gql\n",
    "from gql.transport.requests import RequestsHTTPTransport\n",
    "\n",
    "# Set up the GraphQL client\n",
    "hasura_url = os.getenv(\"HASURA_GRAPHQL_URL\", \"http://hasura:8080/v1/graphql\")\n",
    "admin_secret = os.getenv(\"HASURA_ADMIN_SECRET\", \"\")\n",
    "\n",
    "transport = RequestsHTTPTransport(url=hasura_url, headers={\"x-hasura-admin-secret\": admin_secret})\n",
    "\n",
    "client = Client(transport=transport, fetch_schema_from_transport=True)\n",
    "print(f\"Connected to Hasura at: {hasura_url}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Basic Queries\n",
    "\n",
    "Let's start with some basic queries to explore the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get count of files\n",
    "query = gql(\"\"\"\n",
    "    query {\n",
    "        files_enriched_aggregate {\n",
    "            aggregate {\n",
    "                count\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "\"\"\")\n",
    "\n",
    "result = client.execute(query)\n",
    "file_count = result[\"files_enriched_aggregate\"][\"aggregate\"][\"count\"]\n",
    "print(f\"Total enriched files: {file_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample some file data\n",
    "query = gql(\"\"\"\n",
    "    query {\n",
    "        files_enriched(limit: 10) {\n",
    "            object_id\n",
    "            file_name\n",
    "            extension\n",
    "            size\n",
    "            magic_type\n",
    "            mime_type\n",
    "            is_plaintext\n",
    "            created_at\n",
    "        }\n",
    "    }\n",
    "\"\"\")\n",
    "\n",
    "result = client.execute(query)\n",
    "files_df = pd.DataFrame(result[\"files_enriched\"])\n",
    "print(\"Sample files:\")\n",
    "print(files_df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Analysis Examples\n",
    "\n",
    "Let's analyze the file types and sizes in the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze file extensions\n",
    "query = gql(\"\"\"\n",
    "    query {\n",
    "        files_enriched {\n",
    "            extension\n",
    "            size\n",
    "        }\n",
    "    }\n",
    "\"\"\")\n",
    "\n",
    "result = client.execute(query)\n",
    "files_df = pd.DataFrame(result[\"files_enriched\"])\n",
    "\n",
    "# Plot file extensions\n",
    "plt.figure(figsize=(12, 6))\n",
    "extension_counts = files_df[\"extension\"].value_counts().head(10)\n",
    "extension_counts.plot(kind=\"bar\")\n",
    "plt.title(\"Top 10 File Extensions\")\n",
    "plt.xlabel(\"Extension\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze file sizes\n",
    "plt.figure(figsize=(12, 6))\n",
    "files_df[\"size_mb\"] = files_df[\"size\"] / (1024 * 1024)\n",
    "plt.hist(files_df[\"size_mb\"], bins=50, alpha=0.7)\n",
    "plt.title(\"File Size Distribution\")\n",
    "plt.xlabel(\"Size (MB)\")\n",
    "plt.ylabel(\"Frequency\")\n",
    "plt.yscale(\"log\")\n",
    "plt.show()\n",
    "\n",
    "print(f\"Average file size: {files_df['size_mb'].mean():.2f} MB\")\n",
    "print(f\"Median file size: {files_df['size_mb'].median():.2f} MB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Findings Analysis\n",
    "\n",
    "Let's explore the security findings in the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample some findings data\n",
    "query = gql(\"\"\"\n",
    "    query {\n",
    "        findings_aggregate {\n",
    "            aggregate {\n",
    "                count\n",
    "            }\n",
    "        }\n",
    "        findings(limit: 10) {\n",
    "            finding_name\n",
    "            category\n",
    "            severity\n",
    "            origin_name\n",
    "            created_at\n",
    "        }\n",
    "    }\n",
    "\"\"\")\n",
    "\n",
    "result = client.execute(query)\n",
    "findings_count = result[\"findings_aggregate\"][\"aggregate\"][\"count\"]\n",
    "findings_df = pd.DataFrame(result[\"findings\"])\n",
    "\n",
    "print(f\"Total findings: {findings_count}\")\n",
    "print(\"\\nSample findings:\")\n",
    "print(findings_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze findings by category and severity\n",
    "query = gql(\"\"\"\n",
    "    query {\n",
    "        findings {\n",
    "            category\n",
    "            severity\n",
    "            finding_name\n",
    "        }\n",
    "    }\n",
    "\"\"\")\n",
    "\n",
    "result = client.execute(query)\n",
    "all_findings_df = pd.DataFrame(result[\"findings\"])\n",
    "\n",
    "# Plot findings by category\n",
    "plt.figure(figsize=(15, 5))\n",
    "\n",
    "plt.subplot(1, 3, 1)\n",
    "category_counts = all_findings_df[\"category\"].value_counts()\n",
    "category_counts.plot(kind=\"bar\")\n",
    "plt.title(\"Findings by Category\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.xticks(rotation=45)\n",
    "\n",
    "plt.subplot(1, 3, 2)\n",
    "severity_counts = all_findings_df[\"severity\"].value_counts()\n",
    "severity_counts.plot(kind=\"bar\")\n",
    "plt.title(\"Findings by Severity\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.xticks(rotation=45)\n",
    "\n",
    "plt.subplot(1, 3, 3)\n",
    "finding_name_counts = all_findings_df[\"finding_name\"].value_counts().head(10)\n",
    "finding_name_counts.plot(kind=\"bar\")\n",
    "plt.title(\"Top 10 Finding Types\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.xticks(rotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Advanced Queries\n",
    "\n",
    "Here are some more advanced queries that join data across tables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Files with findings - joined query\n",
    "query = gql(\"\"\"\n",
    "    query {\n",
    "        files_enriched {\n",
    "            object_id\n",
    "            file_name\n",
    "            extension\n",
    "            size\n",
    "            findingsByObjectId {\n",
    "                finding_name\n",
    "                category\n",
    "                severity\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "\"\"\")\n",
    "\n",
    "result = client.execute(query)\n",
    "files_with_findings = []\n",
    "\n",
    "for file_item in result[\"files_enriched\"]:\n",
    "    if file_item[\"findingsByObjectId\"]:\n",
    "        for finding in file_item[\"findingsByObjectId\"]:\n",
    "            files_with_findings.append(\n",
    "                {\n",
    "                    \"object_id\": file_item[\"object_id\"],\n",
    "                    \"file_name\": file_item[\"file_name\"],\n",
    "                    \"extension\": file_item[\"extension\"],\n",
    "                    \"size\": file_item[\"size\"],\n",
    "                    \"finding_name\": finding[\"finding_name\"],\n",
    "                    \"category\": finding[\"category\"],\n",
    "                    \"severity\": finding[\"severity\"],\n",
    "                }\n",
    "            )\n",
    "\n",
    "files_findings_df = pd.DataFrame(files_with_findings)\n",
    "print(f\"Files with findings: {len(files_findings_df)}\")\n",
    "print(files_findings_df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom Analysis\n",
    "\n",
    "This is where you can add your own custom queries and analysis!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add your custom analysis here\n",
    "print(\"Ready for your custom analysis!\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
